{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a9adc3bf-353c-411e-a471-0e92786e7103",
   "metadata": {},
   "source": [
    "# 使用来自 `tiktoken` 的字节对编码\n",
    "tiktoken是一个用于OpenAI模型的快速BPE标记器。（BPE标记器是一种基于字节对编码（Byte Pair Encoding，简称BPE）的文本标记方法。字节对编码是一种数据压缩技术，但在自然语言处理中，它也被用于创建词汇表和对文本进行分词。）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "4036ffa3-0e5c-433a-a997-4ed7d33de0b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install tiktoken"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "1c490fca-a48a-47fa-a299-322d1a08ad17",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tiktoken version: 0.6.0\n"
     ]
    }
   ],
   "source": [
    "import importlib.metadata\n",
    "# 打印出当前系统中安装的 tiktoken 库的版本号\n",
    "print(\"tiktoken version:\", importlib.metadata.version(\"tiktoken\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0952667c-ce84-4f21-87db-59f52b44cec4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import tiktoken\n",
    "# 创建一个使用 GPT-2 模型的编码器对象\n",
    "tik_tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
    "# ，定义一个包含文本的字符串变量，使用 tik_tokenizer 对象对文本进行编码\n",
    "text = \"Hello, world. Is this-- a test?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b039c350-18ad-48fb-8e6a-085702dfc330",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]\n"
     ]
    }
   ],
   "source": [
    "# 参数 allowed_special，该参数指定哪些特殊字符允许出现在编码结果\n",
    "integers = tik_tokenizer.encode(text, allowed_special={\"<|endoftext|>\"})\n",
    "\n",
    "print(integers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "7b152ba4-04d3-41cc-849f-adedcfb8cabb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hello, world. Is this-- a test?\n"
     ]
    }
   ],
   "source": [
    "# 进行解码\n",
    "strings = tik_tokenizer.decode(integers)\n",
    "\n",
    "print(strings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "cf148a1a-316b-43ec-b7ba-1b6d409ce837",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "50257\n"
     ]
    }
   ],
   "source": [
    "# 表示编码器的词汇表大小\n",
    "print(tik_tokenizer.n_vocab)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6a0b5d4f-2af9-40de-828c-063c4243e771",
   "metadata": {},
   "source": [
    "# 使用在GPT-2中使用的原始字节对编码实现"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "0903108c-65cb-4ae1-967a-2155e25349c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "from bpe_openai_gpt2 import get_encoder, download_vocab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "35dd8d7c-8c12-4b68-941a-0fd05882dd45",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Fetching encoder.json: 1.04Mit [00:02, 502kit/s]                                                    \n",
      "Fetching vocab.bpe: 457kit [00:02, 212kit/s]                                                        \n"
     ]
    }
   ],
   "source": [
    "download_vocab()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "1888a7a9-9c40-4fe0-99b4-ebd20aa1ffd0",
   "metadata": {},
   "outputs": [],
   "source": [
    "orig_tokenizer = get_encoder(model_name=\"gpt2_model\", models_dir=\".\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "2740510c-a78a-4fba-ae18-2b156ba2dfef",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]\n"
     ]
    }
   ],
   "source": [
    "integers = orig_tokenizer.encode(text)\n",
    "\n",
    "print(integers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "434d115e-990d-42ad-88dd-31323a96e10f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hello, world. Is this-- a test?\n"
     ]
    }
   ],
   "source": [
    "strings = orig_tokenizer.decode(integers)\n",
    "\n",
    "print(strings)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4f63e8c6-707c-4d66-bcf8-dd790647cc86",
   "metadata": {},
   "source": [
    "# 使用HuggingFace Transformers中的BytePair Tokenizer\n",
    "\r\n",
    "\r\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "5bfff386-f725-4137-9c50-e5da0c38bea0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pip install transformers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "e9077bf4-f91f-42ad-ab76-f3d89128510e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'4.33.3'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import transformers\n",
    "\n",
    "transformers.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "a9839137-b8ea-4a2c-85fc-9a63064cf8c8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ef488b57e4214b76a8913a4704de7e15",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9ab86eb5125640dba6d59a5744f2d927",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "073f03eb3ef541e092c8f344f65c34da",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b92abbedc99a4bb9ad8bf5f3b3e8b140",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from transformers import GPT2Tokenizer\n",
    "# 使用 HuggingFace Transformers 提供的 GPT2Tokenizer 类，创建一个预训练的 GPT-2 模型的标记器对象\n",
    "hf_tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "222cbd69-6a3d-4868-9c1f-421ffc9d5fe1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hf_tokenizer(strings)[\"input_ids\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "907a1ade-3401-4f2e-9017-7f58a60cbd98",
   "metadata": {},
   "source": [
    "# 快速测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "a61bb445-b151-4a2f-8180-d4004c503754",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('../01_main-chapter-code/the-verdict.txt', 'r', encoding='utf-8') as f:\n",
    "    raw_text = f.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "57f7c0a3-c1fd-4313-af34-68e78eb33653",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "14.6 ms ± 201 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "# 测量其运行时间，从而进行性能评估\n",
    "%timeit orig_tokenizer.encode(raw_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "036dd628-3591-46c9-a5ce-b20b105a8062",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.9 ms ± 42.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit tik_tokenizer.encode(raw_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "b9c85b58-bfbc-465e-9a7e-477e53d55c90",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "28.6 ms ± 643 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit hf_tokenizer(raw_text)[\"input_ids\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "67159770-e131-411a-b9fe-037b4d931c9d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "28.3 ms ± 601 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)[\"input_ids\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1cf5928-b6c8-4493-9e51-cb2795b2482c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f70f164-5f73-479e-bfaf-914243016439",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
